Home Credit Python Scoring for Collections
Model Evaluation and Comparison Workflow v.0.8.1
Copyright:
© 2017-2020, Pavel Sůva, Marek Teller, Martin Kotek, Jan Zeller, Marek Mukenšnabl, Kirill Odintsov, Jan Hynek, Elena Kuchina, Lubor Pacák, Naďa Horká and Home Credit & Finance Bank Limited Liability Company, Moscow, Russia – all rights reserved
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
For a list of contributors, see the GitLab page
import time
import glob
import operator
import math
import random
import numpy as np
import pandas as pd
# note: only the datetime class is imported; the bare `import datetime` would be shadowed by it
from datetime import datetime
# import cx_Oracle
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os.path
import pickle
import gc
# check your tqdm version if import fails
from tqdm.notebook import tqdm
# import tkinter
import sys
sys.path.insert(0, "..")
import scoring
from matplotlib.ticker import FuncFormatter
from matplotlib import gridspec
# from sklearn.metrics import roc_auc_score
from sklearn import metrics
from ipywidgets import FloatProgress
# import category_encoders as ce
# import graphviz
# import json
# import warnings
import multiprocessing as mp
from multiprocessing import Pool
from scipy.stats import gaussian_kde
import plotly.express as px
sns.set()
%matplotlib inline
%config InlineBackend.close_figures=True
from IPython.display import display, Markdown, HTML
pd.options.display.max_columns = None
pd.options.display.max_rows = 30
output_folder = 'documentation_evaluation_demo'
os.makedirs(output_folder, exist_ok=True)
os.makedirs(os.path.join(output_folder, 'performance'), exist_ok=True)
os.makedirs(os.path.join(output_folder, 'daily_bootstrap'), exist_ok=True)
scoring.check_version('0.8.0')
For the evaluation, we need to load a dataset containing all the out-of-time data for both workflows, e.g. the Z-transformation dataset.
For the High and Low workflows, the respective scoring models have to be loaded as well, so that the Low dataset can be scored with the High model and vice versa. It is also possible to import a new dataset and score it with all three models.
from scoring import db
data = db.read_csv(
r"coll_demo_data\dataset_scored_z_demo.csv",
sep=",",
decimal=".",
optimize_types=True,
encoding="utf-8",
# index_col="ID",
low_memory=False,
keep_default_na=False,
na_values=[""],
)
print("Data loaded on", datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d %H:%M:%S"))
Load the Main Banking Product (MBP) if it is not part of the loaded dataset:
### THESE COLUMNS MUST BE INCLUDED IN THE DATA SET ###
# name of your target column in your dataset
col_target_orig = "TARGET_10D"
col_target = 'TARGET_Z'
# name of the time column in your dataset
col_time = "STARTDATE"
# name of the workflow column - usually Low, High, Medium etc.
col_workflow = 'PROCESS_NAME'
col_treatment = 'HIGHER_TREATMENT'
# name of the product column - e.g. CASH/CONSUMER
col_product = 'TYPEOCREDIT'
### THESE COLUMNS DON'T HAVE TO BE INCLUDED IN THE DATA SET AND ARE CREATED AUTOMATICALLY LATER with this given name ###
#name of the base column
col_base = "BASE"
# name of the year column
col_year = "YEAR"
# name of the month column
col_month = "MONTH"
# name of the day column
col_day = "DAY"
# name of the year and week column
col_week = "WEEK"
col_instalment = 'AMTINSTALMENT'
col_receivable = 'AMT_RECEIVABLE'
# name of the weight column
col_weight = 'WEIGHT'
col_score = 'SCORE'
col_workflow_high = 'HIGHER_TREATMENT'
We will use the same masks as were created in the Python Scoring Workflow step to identify the Train, Valid, Test, OOT and HOOT samples.
train_mask = (data["data_type"] == "train") & (data[col_base] == 1)
valid_mask = (data["data_type"] == "valid") & (data[col_base] == 1)
test_mask = (data["data_type"] == "test") & (data[col_base] == 1)
oot_mask = (data["data_type"] == "oot") & (data[col_base] == 1)
hoot_mask = (data["data_type"] == "hoot") & (data[col_base] == 1)
observable_mask = data[col_base] == 1
# for upload of new, Out-of-Time dataset for evaluation:
# oot_mask = data[col_base] == 1
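As a quick optional check (a sketch, not required by the workflow), you can print the number of observable contracts in each sample:
# row counts per data sample (observable base only)
for name, mask in [('train', train_mask), ('valid', valid_mask), ('test', test_mask),
                   ('oot', oot_mask), ('hoot', hoot_mask)]:
    print(name, int(mask.sum()))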
Datetime Format Preparation
dtime_input_format = '%Y-%m-%d'
col_day = 'day'
data[[col_time, 'PAIDDATE']] = data[[col_time, 'PAIDDATE']].apply(pd.to_datetime, format=dtime_input_format, cache=False)
# day of year, used later for the daily simulations
data[col_day] = data[col_time].dt.dayofyear
As we created two distinct models for the High and Low treatments in the Python Scoring Workflow, we have to load both models and score the whole dataset with each of them, so that the contracts from one treatment also get a score from the other treatment's model: Low-treatment contracts are scored by the High model and vice versa. We will do this on our Z-score dataset. Please be aware that if you created any feature-engineering or interaction predictors for any of the models, you have to create the same ones in this dataset, too.
from scoring.data_manipulation import split_predictors_bytype
cols_pred = list(pd.read_csv(r'coll_demo_data/predictors.csv', sep = ',', decimal = '.',
encoding = 'windows-1251', low_memory = False, header = None)[0])
cols_pred, cols_pred_num, cols_pred_cat = split_predictors_bytype(data,
pred_list=cols_pred,
non_pred_list= [],
optimize_types=True,
convert_bool2int=True)
num_to_cat = ['SHOPPER', 'CODEREGIONCLIENT']
cat_to_num = []
to_delete_category = ['AMTBALANCEACTUALCONTRACT', 'AMTINCOMEHOUSEHOLD']
print('Initial number of numerical predictors:',len(cols_pred_num))
print('Initial number of categorical predictors:',len(cols_pred_cat))
for i in num_to_cat:
    if i in cols_pred_num:
        cols_pred_num.remove(i)
        cols_pred_cat.append(i)
        if data[i].dtype not in {'object', 'string', 'category'}:
            try:
                data[i] = data[i].astype('category')
                print('Predictor ' + str(i) + ' was moved to categorical predictors.')
            except (TypeError, ValueError):
                data[i] = data[i].astype(str)
                print('Predictor ' + str(i) + ' was converted to string and moved to categorical predictors.')
    else:
        print('Predictor ' + str(i) + ' not in numerical predictors.')
for i in cat_to_num:
    if i in cols_pred_cat:
        cols_pred_cat.remove(i)
        cols_pred_num.append(i)
        if not pd.api.types.is_numeric_dtype(data[i]):
            try:
                # pd.to_numeric replaces the undefined get_optimal_numerical_type helper here
                data[i] = pd.to_numeric(data[i])
                print('Predictor ' + str(i) + ' was moved to numerical predictors.')
            except (ValueError, TypeError):
                print('Column {0} couldn\'t be converted to numerical. Will be used as categorical.'.format(i))
                cols_pred_num.remove(i)
                cols_pred_cat.append(i)
    else:
        print('Predictor ' + str(i) + ' not in categorical predictors.')
if len(num_to_cat + cat_to_num) > 0:
    print('Category for some predictors has changed.')
for i in to_delete_category:
    if i in cols_pred_num:
        cols_pred_num.remove(i)
        print('Predictor ' + str(i) + ' was deleted from numerical predictors.')
    elif i in cols_pred_cat:
        cols_pred_cat.remove(i)
        print('Predictor ' + str(i) + ' was deleted from categorical predictors.')
    else:
        print('Predictor ' + str(i) + ' was not found in predictors.')
print('Updated number of numerical predictors:',len(cols_pred_num))
print('Updated number of categorical predictors:',len(cols_pred_cat))
cols_pred = cols_pred_num + cols_pred_cat
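A small optional consistency check (a sketch): after the manual moves, the numerical and categorical lists should stay disjoint and together make up cols_pred.
# sanity check of the predictor lists
assert not set(cols_pred_num) & set(cols_pred_cat), 'a predictor ended up in both lists'
assert set(cols_pred) == set(cols_pred_num) | set(cols_pred_cat)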
The evaluation is based on the comparison of results for multiple models. In this workflow, we can directly use the one-model Z-transformation uplift, the two-model High/Low score difference, a random uplift baseline, ordering by receivable, and optionally an LGBM model (commented out in this demo).
It is possible and viable to comment out all the models which you do not need; the easiest way is to select the lines and press Ctrl+/.
from scoring.grouping import Grouping
grouping_trans = r"coll_demo_data/mg_demo_z.json"
grouping_z = Grouping(columns = sorted(cols_pred_num),
cat_columns = sorted(cols_pred_cat),
group_count=5,
min_samples=100,
min_samples_cat=100)
grouping_z.load(grouping_trans)
data_woe = grouping_z.transform(data, columns_to_transform=grouping_z.bins_data_.keys(), transform_to="woe", progress_bar=True)
woe_columns_to_replace = list()
for column in data_woe.columns:
    if column in data:
        woe_columns_to_replace.append(column)
        print("Column", column, "dropped as it already existed in the data set.")
data = data.drop(woe_columns_to_replace, axis="columns")
data = data.join(data_woe)
del data_woe
gc.collect()
print("Added WOE variables. Number of columns:", data.shape[1])
cols_woe = [s + "_WOE" for s in cols_pred]
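The drop-and-join pattern above is repeated below for the High and Low groupings. A small helper (a sketch, not part of the scoring library) would avoid the duplication:
def add_woe_columns(data, grouping, progress_bar=True):
    """Transform predictors to WOE with the given grouping and join them to data,
    dropping any previously joined copies of the same WOE columns first."""
    data_woe = grouping.transform(data, columns_to_transform=grouping.bins_data_.keys(),
                                  transform_to="woe", progress_bar=progress_bar)
    stale = [column for column in data_woe.columns if column in data]
    return data.drop(stale, axis="columns").join(data_woe)
# usage, equivalent to the cell above: data = add_woe_columns(data, grouping_z)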
Enter the Model Filename for Z-transformation
model_filename_z = r'coll_demo_data/myModelSW_demo_Z.model'
modelSW_Z = pickle.load(open(model_filename_z, 'rb'))
data['SCORE'] = modelSW_Z.predict(data)
data['UPLIFT_1MODEL'] = 2*data['SCORE']-1
col_score = 'UPLIFT_1MODEL'
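A note on the $2 \cdot \mathrm{SCORE} - 1$ mapping: assuming the Z target was built with the standard class-variable transformation and a balanced (50/50) treatment split, the model's estimate of $P(Z=1 \mid x)$ relates to the uplift as
$$P(Z = 1 \mid x) = \frac{1 + \text{uplift}(x)}{2} \quad\Longrightarrow\quad \text{uplift}(x) = 2 \cdot P(Z = 1 \mid x) - 1$$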
# model_filename_lgbm = 'lgbm_final_model_20200611.model'
# modelSW_LGBM = pickle.load(open(model_filename_lgbm, 'rb'))
# data['SCORE_LGBM'] = modelSW_LGBM.predict(data[cols_pred], num_iteration = modelSW_LGBM.best_iteration)
# data['UPLIFT_LGBM_MODEL'] = 2*data['SCORE_LGBM']-1
# lgbm_score = 'UPLIFT_LGBM_MODEL'
Import the Grouping for High Treatment Model
grouping_high = r'coll_demo_data/mg_demo_high.json'
grouping_h = Grouping(columns = sorted(cols_pred_num),
cat_columns = sorted(cols_pred_cat),
group_count=5,
min_samples=100,
min_samples_cat=100)
grouping_h.load(grouping_high)
data_woe = grouping_h.transform(data, columns_to_transform=grouping_h.bins_data_.keys(), transform_to="woe", progress_bar=True)
woe_columns_to_replace = list()
for column in data_woe.columns:
    if column in data:
        woe_columns_to_replace.append(column)
        print("Column", column, "dropped as it already existed in the data set.")
data = data.drop(woe_columns_to_replace, axis="columns")
data = data.join(data_woe)
del data_woe
gc.collect()
print("Added WOE variables. Number of columns:", data.shape[1])
cols_woe = [s + "_WOE" for s in cols_pred]
Enter the Model Filename for HIGH
model_filename_high = r'coll_demo_data\myModelSW_demo_HIGH.model'
modelSW_HIGH = pickle.load(open(model_filename_high, 'rb'))
high_score = 'SCORE_HIGH'
data[high_score] = modelSW_HIGH.predict(data)
Enter the Grouping Filename for LOW
grouping_low = r'coll_demo_data/mg_demo_low.json'
Enter the Model Filename for LOW
model_filename_low = r'coll_demo_data/myModelSW_demo_LOW.model'
modelSW_LOW = pickle.load(open(model_filename_low, 'rb'))
grouping_l = Grouping(columns = sorted(cols_pred_num),
cat_columns = sorted(cols_pred_cat),
group_count=5,
min_samples=100,
min_samples_cat=100)
grouping_l.load(grouping_low)
data_woe = grouping_l.transform(data, columns_to_transform=grouping_l.bins_data_.keys(), transform_to="woe", progress_bar=True)
woe_columns_to_replace = list()
for column in data_woe.columns:
    if column in data:
        woe_columns_to_replace.append(column)
        print("Column", column, "dropped as it already existed in the data set.")
data = data.drop(woe_columns_to_replace, axis="columns")
data = data.join(data_woe)
del data_woe
gc.collect()
print("Added WOE variables. Number of columns:", data.shape[1])
cols_woe = [s + "_WOE" for s in cols_pred]
low_score = 'SCORE_LOW'
data[low_score] = modelSW_LOW.predict(data)
We are using a slightly different score_difference here than usual: for the uplift (gain), we need the expected improvement from applying the High treatment instead of the Low one.
As we are modelling the unpaid target, this can also be computed as the probability of Unpaid in Low minus the probability of Unpaid in High.
col_score_diff = 'SCORE_DIFF'
data[col_score_diff] = data['SCORE_LOW'] - data['SCORE_HIGH']
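In symbols, with both scores estimating the probability of the unpaid target:
$$\text{uplift} = P(\text{paid} \mid \text{High}) - P(\text{paid} \mid \text{Low}) = P(\text{unpaid} \mid \text{Low}) - P(\text{unpaid} \mid \text{High}) = \mathrm{SCORE\_LOW} - \mathrm{SCORE\_HIGH}$$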
Creates a random uplift baseline: a uniform random number between -1 and 1 for each row.
# random uplift
col_uplift_random = 'UPLIFT_RANDOM'
data[col_uplift_random] = np.random.uniform(-1, 1, size=len(data))
Define the names of the ASB columns:
col_ASB = 'ASB_REAL'
col_ASB_diff_one = 'ASB_DIFF_TRANSFORMED'
col_ASB_diff_two = 'ASB_DIFF_TWOMODEL'
col_ASB_diff_rnd = 'ASB_DIFF_RANDOM'
col_ASB_diff_rec = 'ASB_DIFF_RECEIVABLE'
# col_ASB_diff_lgbm = 'ASB_DIFF_LGBM'
col_cost = 'WF_COST'
col_CAASB_OPT = 'WORKFLOW_CAASB_optimum'
col_CAASB = 'CAASB_REAL'
col_CAASB_diff_one = 'CAASB_DIFF_TRANS'
col_CAASB_diff_two = 'CAASB_DIFF_TWOMODEL'
col_CAASB_diff_rnd = 'CAASB_DIFF_RANDOM'
col_CAASB_diff_rec = 'CAASB_DIFF_RECEIVABLE'
# col_CAASB_diff_lgbm = 'CAASB_DIFF_LGBM'
# set the names of the instalment and receivable columns
col_instalment = 'AMTINSTALMENT'
col_receivable = 'AMT_RECEIVABLE'
# creating new columns, the optimal way
data.loc[:,col_ASB] = 0
data.loc[:,col_CAASB] = 0
data.loc[:,col_CAASB_OPT] = ''
col_product = 'MBP'
products_all = data[col_product].unique()
display(products_all)
# RENAME the coefficient columns to the provision coefficients which you will use;
# the product rows are created automatically from your data
# TODO: How to do it in general??
provision_coeffs = pd.DataFrame(columns=['C_0', 'C_1'], index=products_all, dtype=float)
provision_coeffs.rename_axis('Banking Products', axis='rows', inplace=True)
provision_coeffs.rename_axis('Coefficients', axis='columns', inplace=True)
# Set the right ratio of provision coefficients for the CAASB computation, for all products:
# Set the correct coefficient values
# for KZ, I've used the exponential trend from provisions 2020/01
provision_coeffs.loc['PoS', 'C_0'] = 0.04
provision_coeffs.loc['Cash Walk-in', 'C_0'] = 0.05
provision_coeffs.loc['Cash X-sell', 'C_0'] = 0.024
provision_coeffs.loc['RC', 'C_0'] = 0.05
provision_coeffs.loc['PoS', 'C_1'] = 0.13
provision_coeffs.loc['Cash Walk-in', 'C_1'] = 0.22
provision_coeffs.loc['Cash X-sell', 'C_1'] = 0.10
provision_coeffs.loc['RC', 'C_1'] = 0.10  # different computation
print('TABLE of Provision Coefficients')
display(provision_coeffs)
# COSTS for each workflow used in the model - first creating a dictionary:
workflows = data[col_workflow].unique()
costs_workflow = {name:0 for name in workflows}
costs_workflow[workflows[0]] = 20
costs_workflow[workflows[1]] = 50
display(costs_workflow)
# map each workflow to its cost
data[col_cost] = data[col_workflow].map(costs_workflow)
# the column can get created as 'category' or 'object' instead of a numeric type,
# therefore we simply retype it to a numeric type
data[col_cost] = pd.to_numeric(data[col_cost])
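Since map returns NaN for any workflow name missing from the cost dictionary, a quick optional check (a sketch) confirms that every contract received a cost:
assert data[col_cost].notna().all(), 'some workflow has no cost assigned'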
Computing CAASB (the Cost-Adjusted Average Saved Balance): $$CAASB = ASB - costs$$ $$ASB = c_i \cdot R - \sum_{u=0}^{m} P_u \cdot c_u \cdot (R - (m-u) \cdot r)$$
To compute CAASB, we need to know the receivable, the instalment amount, the provision coefficients and the costs. Set the names of the new columns and assign the receivable and instalment amounts to the corresponding columns.
Warning: this is only one of the possible CAASB equations; it is computed at DPD = 1 with the target date still in bucket 1.
$$ASB = c_0 \cdot R - \sum_{u=0}^{1} P_u \cdot c_u \cdot (R - (1-u) \cdot r)$$
For different workflows, the equation can differ.
Before computing ASB, you should always check, and if necessary change, the equations for Precollection or later stages according to the general equation above.
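As a worked instance of the special case above (illustrative numbers matching the demo PoS coefficients set above: $c_0 = 0.04$, $c_1 = 0.13$, with $R = 1000$ and $r = 100$):
$$P_0 = 1 \ (\text{paid}): \quad ASB = c_0 \cdot R - c_0 \cdot (R - r) = c_0 \cdot r = 4$$
$$P_1 = 1 \ (\text{unpaid}): \quad ASB = c_0 \cdot R - c_1 \cdot R = (c_0 - c_1) \cdot R = -90$$
These two branches are exactly the asb_coef_p0 and asb_coef_p1 terms computed in the code below.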
# COMPUTATION OF PRECO ASB - definitions
def get_asb_coefficient_p0(instalment, product, c_0, c_1):
    # ASB if the contract pays: c_0 * r
    asb_coef_p0 = c_0[product.values].astype(float) * instalment.values
    return asb_coef_p0

def get_asb_coefficient_p1(receivable, instalment, product, c_0, c_1):
    # ASB if the contract stays unpaid: (c_0 - c_1) * R
    asb_coef_p1 = (c_0[product.values] - c_1[product.values]).astype(float) * receivable.values
    return asb_coef_p1

def get_asb_real(asb_coef_p0, asb_coef_p1, target):
    # realized ASB: unpaid contracts (target = 1) get the p1 branch, paid ones the p0 branch
    asb_real = target * asb_coef_p1.values + (1 - target) * asb_coef_p0.values
    return asb_real

def get_asb_diff(asb_coef_p0, asb_coef_p1, score_diff):
    # expected ASB gain of the High treatment, given the estimated uplift score_diff
    asb_diff = -score_diff * asb_coef_p1.values + score_diff * asb_coef_p0.values
    return asb_diff

def get_caasb_real(asb_real, costs_real):
    caasb_real = asb_real.values - costs_real.values
    return caasb_real

def get_caasb_diff(asb_diff, costs_all):
    # assumes the workflow cost dictionary has the keys 'HIGH' and 'LOW'
    caasb_diff = asb_diff.values - costs_all['HIGH'] + costs_all['LOW']
    return caasb_diff
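A toy check of these helpers (an optional sketch with illustrative numbers; it assumes 'PoS' is among your products and reuses the demo PoS coefficients set above, so the expected output matches the worked example):
# two synthetic 'PoS' contracts: one paid (target = 0), one unpaid (target = 1)
_toy = pd.DataFrame({'product': ['PoS', 'PoS'],
                     'receivable': [1000.0, 1000.0],
                     'instalment': [100.0, 100.0],
                     'target': [0, 1]})
_p0 = get_asb_coefficient_p0(_toy['instalment'], _toy['product'],
                             provision_coeffs['C_0'], provision_coeffs['C_1'])
_p1 = get_asb_coefficient_p1(_toy['receivable'], _toy['instalment'], _toy['product'],
                             provision_coeffs['C_0'], provision_coeffs['C_1'])
print(get_asb_real(_p0, _p1, _toy['target'].values))  # expected: [4.0, -90.0]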
asb_coef_p0 = get_asb_coefficient_p0(
data[col_instalment],
data[col_product],
provision_coeffs['C_0'],
provision_coeffs['C_1'])
asb_coef_p1 = get_asb_coefficient_p1(
data[col_receivable],
data[col_instalment],
data[col_product],
provision_coeffs['C_0'],
provision_coeffs['C_1'])
# real ASB and ASB differences
# commented out the models which we do not have
data[col_ASB] = get_asb_real(asb_coef_p0, asb_coef_p1, data[col_target_orig])
data[col_ASB_diff_one] = get_asb_diff(asb_coef_p0, asb_coef_p1, data[col_score])
data[col_ASB_diff_two] = get_asb_diff(asb_coef_p0, asb_coef_p1, data[col_score_diff])
data[col_ASB_diff_rnd] = get_asb_diff(asb_coef_p0, asb_coef_p1, data[col_uplift_random])
# data[col_ASB_diff_lgbm] = get_asb_diff(asb_coef_p0, asb_coef_p1, data[lgbm_score])
# real CAASB and CAASB differences
data[col_CAASB] = get_caasb_real(data[col_ASB], data[col_cost])
data[col_CAASB_diff_one] = get_caasb_diff(data[col_ASB_diff_one], costs_workflow)
data[col_CAASB_diff_two] = get_caasb_diff(data[col_ASB_diff_two], costs_workflow)
data[col_CAASB_diff_rnd] = get_caasb_diff(data[col_ASB_diff_rnd], costs_workflow)
# data[col_CAASB_diff_lgbm] = get_caasb_diff(data[col_ASB_diff_lgbm], costs_workflow)
# Adding sorting by Receivable
data[col_CAASB_diff_rec] = data[col_receivable]
Save the dataset with all scores and the computed CAASB diffs.
savepath = os.path.join(output_folder,'dataset_demo_out_caasb.csv')
data.to_csv(savepath, encoding='utf-8', index=False)
Load Dataset with CAASB Computed
# path = os.path.join(output_folder,'dataset_demo_out_caasb.csv')
# data = db.read_csv(
# path,
# sep=",",
# decimal=".",
# optimize_types=True,
# encoding="utf-8",
# # index_col="ID",
# low_memory=False,
# keep_default_na=False,
# na_values=[""],
# )
# print("Data loaded on", datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d %H:%M:%S"))
Set which models will be shown in the Impact Analysis
# order of contracts by the money (CAASB) impact
_uplift_metrics = [col_CAASB_diff_one,
col_CAASB_diff_two,
col_CAASB_diff_rnd,
col_CAASB_diff_rec,
# col_CAASB_diff_lgbm
]
# variant with the ordering by uplift/score_difference
_uplift_metrics_score = [col_score,
# lgbm_score,
col_score_diff,
col_receivable,
col_uplift_random]
We are using the uplift metrics based on:
https://tech.wayfair.com/data-science/2018/10/pylift-a-fast-python-package-for-uplift-modeling/
from scoring import coll_evaluation
impact_analysis = coll_evaluation.CollModelImpactAnalysis(data[oot_mask],
weight=col_weight,
base = col_base,
uplift_metrics=_uplift_metrics,
treatment=col_treatment,
outcome=col_target_orig,
target=col_target,
caasb=col_CAASB,
time=col_time,
n_bins=10,
n_bootstraps = 20,
n_histories=20,
use_caasb=True,
alpha=2,
)
Bootstrapped_CAASB_curves = impact_analysis.bootstrap_impact_analysis(by_day=None)
impact_analysis.plot_impact_analysis(Bootstrapped_CAASB_curves,
# savefile=os.path.join(output_folder,"avg_CAASB_predicted_demo.png")
)
This part simulates the daily segmentation with two possible settings: a dynamic cutoff, re-optimised every day, or a fixed cutoff.
For the day-by-day strategy, the ideal cutoff can differ from the overall cutoff displayed in the Impact Analysis chapter. Therefore, you can decide here, based on the day-by-day results, which cutoff performs best.
It is possible and recommended to first run the simulations with a dynamic cutoff and then, after choosing the best performing value, run them again with a fixed cutoff to obtain the money-saving results.
Each entry in the challengers dictionary is one Challenger strategy (a set of one or more models).
The _type is just a name for the strategy.
challengers = {}
challengers[0] = {'challengers' : ['CAASB_DIFF_TRANS'], '_type' : 'CAASB_DIFF_TRANS' }
challengers[1] = {'challengers' : ['CAASB_DIFF_TWOMODEL'], '_type' : 'CAASB_DIFF_TWOMODEL' }
# challengers[2] = {'challengers' : ['CAASB_DIFF_LGBM'], '_type' : 'CAASB_DIFF_LGBM' }
challengers[2] = {'challengers' : ['CAASB_DIFF_RANDOM'], '_type' : 'CAASB_DIFF_RANDOM' }
challengers[3] = {'challengers' : ['CAASB_DIFF_RECEIVABLE'], '_type' : 'CAASB_DIFF_RECEIVABLE' }
challengers[4]= {'challengers' : ['CAASB_DIFF_TRANS',
'CAASB_DIFF_TWOMODEL',
'CAASB_DIFF_RANDOM',
# 'CAASB_DIFF_LGBM',
'CAASB_DIFF_RECEIVABLE'],\
'_type' : 'all'}
# challengers[5]= {'challengers' : ['CAASB_DIFF_TRANS', 'CAASB_DIFF_TWOMODEL', 'CAASB_DIFF_RECEIVABLE'],\
# '_type' : 'three_models'}
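Since the single-model strategies mirror _uplift_metrics one-to-one, the active entries above could equivalently be built programmatically (a sketch):
# equivalent construction of the active strategies above
challengers = {i: {'challengers': [metric], '_type': metric}
               for i, metric in enumerate(_uplift_metrics)}
challengers[len(challengers)] = {'challengers': list(_uplift_metrics), '_type': 'all'}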
Setting the Impact Analysis
To get the overall impact analysis (bootstrapped or not) and to compute the historical daily distributions (again, bootstrapped or not), we need to set the following attributes.
The outcome is the basis for computing the random trajectories for the cumulative impact of the strategies.
HISTORICAL_DAILY_WINNER_DISTRIBUTION = impact_analysis.get_hist_daily_winner_distribution(42, challengers,
dynamic_cutoff=True,
fixed_cutoff=0.2,
progress_bar=True)
# save the results
savepath = os.path.join(output_folder,'dataset_hist_daily_winner_demo.csv')
HISTORICAL_DAILY_WINNER_DISTRIBUTION.to_csv(savepath, encoding='utf-8', index=False)
# load the results
# path = os.path.join(output_folder,'dataset_hist_daily_winner_demo.csv')
# HISTORICAL_DAILY_WINNER_DISTRIBUTION = db.read_csv(
# path,
# sep=",",
# decimal=".",
# optimize_types=True,
# encoding="utf-8",
# # index_col="ID",
# low_memory=False,
# keep_default_na=False,
# na_values=[""],
# )
# print("Data loaded on", datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d %H:%M:%S"))
Create 'trajectories' from randomly chosen bootstraps for each day.
collection_winner_histories = impact_analysis.create_trajectories(HISTORICAL_DAILY_WINNER_DISTRIBUTION,
_seed=33,
challengers=challengers)
Choose your Challenger and Champion strategy
To compare directly, we need to choose exactly two strategies: the Challenger is the strategy we evaluate against the base, and the Champion is the base it is compared to.
choices = []
for key in challengers.keys():
    choices.append(challengers[key]['_type'])
print(choices)
challenger = choices[1] # challenger
champion = choices[3] # champion
print(f'challenger strategy: {challenger}')
print(f'champion strategy: {champion}')
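Selecting the strategies by name instead of by position is less brittle if the challengers dictionary changes (an equivalent sketch; the names must be among the printed choices):
challenger = 'CAASB_DIFF_TWOMODEL'      # same as choices[1] above
champion = 'CAASB_DIFF_RECEIVABLE'      # same as choices[3] above
assert challenger in choices and champion in choices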
trajectories_deltas_differences = impact_analysis.get_deltas_difference(collection_winner_histories,
challenger,
champion)
impact_analysis.plot_dynamic(trajectories_deltas_differences, challenger, champion, show='delta')
For this plot to work, you need a strategy in challengers with more than just one model (a list of models). The plot shows the ratio of winning approaches.
all_models = collection_winner_histories[collection_winner_histories['type'] == 'all']
impact_analysis.plot_winning(all_models, savefile=None)
# DataFrame.append was removed in pandas 2.0; pd.concat is the idiomatic equivalent
winners = pd.concat(
    [collection_winner_histories[collection_winner_histories['type'] == metric]
     for metric in _uplift_metrics]
)
impact_analysis.plot_cutoffs(winners, savefile=None)
impact_analysis.plot_caasb(winners, savefile=None)
impact_analysis.plot_distribution_caasb(trajectories_deltas_differences, challenger, champion, show='absolute',
animation=False, savefile=None)